package xiaoa.java.spider.parse;

import java.net.URI;


import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;



/**
 * Static helpers for extracting the title, hyperlinks and image addresses
 * from an HTML page, and for turning the (possibly relative) addresses found
 * in a page into absolute ones.
 *
 * @author xiaoa
 * @date 2016-11-05 15:07:57
 * @version V1.0
 */
public class ParseHtml {

    /**
     * Matches a {@code <title>} element. Case-insensitive so {@code <TITLE>} is
     * found, DOTALL so the text may span several lines, reluctant so only the
     * first element is captured, and tolerant of attributes on the opening tag.
     */
    private static final Pattern TITLE_PATTERN = Pattern.compile(
            "<title\\b[^>]*>(.*?)</title\\s*>",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Matches a leading URI scheme such as {@code http:}, {@code mailto:} or {@code data:}. */
    private static final Pattern SCHEME_PREFIX =
            Pattern.compile("[A-Za-z][A-Za-z0-9+.\\-]*:");

    /**
     * Extracts the text of the first {@code <title>} element from raw HTML.
     *
     * @param content the HTML text; may be {@code null}
     * @return the text between the title tags (not trimmed), or {@code null}
     *         when {@code content} is {@code null}, has no title element, or
     *         the title element is empty
     * @author xiaoa
     */
    public static String getTitle(String content) {
        if (content == null) {
            return null;
        }
        Matcher matcher = TITLE_PATTERN.matcher(content);
        if (!matcher.find()) {
            return null;
        }
        String title = matcher.group(1);
        // An empty title has always been reported as "no title".
        return title.isEmpty() ? null : title;
    }

    /**
     * Builds the site root ({@code scheme://host[:port]}) of a URI.
     *
     * <p>The scheme and host are concatenated as-is, so a URI that lacks
     * either of them yields the text {@code "null"} in that position.
     *
     * @param uri the page address; may be {@code null}
     * @return the site root without a trailing slash, or {@code null} when
     *         {@code uri} is {@code null}
     * @author xiaoa
     */
    public static String getMainUrl(URI uri) {
        if (uri == null) {
            return null;
        }

        int port = uri.getPort();
        // getPort() reports -1 when the URI carries no explicit port.
        return uri.getScheme() + "://" + uri.getHost() + (port == -1 ? "" : ":" + port);
    }

    /**
     * Collects the targets of all {@code <a>} elements below {@code doc},
     * resolved against the page address.
     *
     * @param doc the element to search; may be {@code null}
     * @param uri the address of the page the element came from
     * @return the resolved link targets in document order (links that cannot
     *         be resolved are skipped), or {@code null} when {@code doc} is
     *         {@code null}
     * @author xiaoa
     */
    public static List<String> getUrls(Element doc, URI uri) {
        if (doc == null) {
            return null;
        }
        return collectResolved(doc, "a", "href", uri);
    }

    /**
     * Turns an address found in a page into an absolute one.
     *
     * <ul>
     *   <li>An address that already starts with a scheme ({@code http:},
     *       {@code https:}, {@code mailto:}, {@code data:}, ...) is returned
     *       unchanged apart from trimming.</li>
     *   <li>Anything else is resolved against {@code uri} the way a browser
     *       would: {@code /x} is relative to the site root, {@code //host/x}
     *       inherits the scheme, {@code x} is relative to the directory of
     *       the page, and {@code ?q} / {@code #f} replace the query or
     *       fragment of the page address.</li>
     *   <li>An address that {@link URI} rejects (for example one containing
     *       spaces) falls back to plain string concatenation so that the link
     *       is not lost.</li>
     * </ul>
     *
     * @param src the address as written in the page; may be {@code null}
     * @param uri the address of the page; may be {@code null}
     * @return the absolute address, or {@code null} when {@code src} is
     *         {@code null} or blank, or when {@code src} is relative and
     *         {@code uri} is {@code null}
     * @author xiaoa
     */
    public static String splitJointUrl(String src, URI uri) {
        if (src == null) {
            return null;
        }
        String ref = src.trim();
        if (ref.isEmpty()) {
            return null;
        }

        // Already absolute: nothing to join.
        if (SCHEME_PREFIX.matcher(ref).lookingAt()) {
            return ref;
        }

        // A relative reference cannot be resolved without a base address.
        if (uri == null) {
            return null;
        }

        try {
            return resolveReference(uri, ref);
        } catch (IllegalArgumentException e) {
            // java.net.URI refuses references with illegal characters;
            // keep the link by joining the strings directly (best effort).
            return joinAsText(uri, ref);
        }
    }

    /**
     * Collects the targets of all {@code <a>} elements in an HTML document,
     * resolved against the page address.
     *
     * @param html the HTML text; may be {@code null}
     * @param uri  the address of the page
     * @return the resolved link targets, or {@code null} when {@code html} is
     *         {@code null} or empty
     * @author xiaoa
     */
    public static List<String> getUrls(String html, URI uri) {
        if (html == null || html.isEmpty()) {
            return null;
        }
        return getUrls(Jsoup.parse(html), uri);
    }

    /**
     * Collects the sources of all {@code <img>} elements in a document,
     * resolved against the page address with the same rules as
     * {@link #splitJointUrl(String, URI)}.
     *
     * @param doc the parsed document; may be {@code null}
     * @param uri the address of the page
     * @return the resolved image addresses in document order (images without
     *         a usable {@code src} are skipped), or {@code null} when
     *         {@code doc} is {@code null}
     * @author xiaoa
     */
    public static List<String> getImgs(Document doc, URI uri) {
        if (doc == null) {
            return null;
        }
        return collectResolved(doc, "img", "src", uri);
    }

    /**
     * Collects the sources of all {@code <img>} elements in an HTML document,
     * resolved against the page address.
     *
     * @param html the HTML text; may be {@code null}
     * @param uri  the address of the page
     * @return the resolved image addresses, or {@code null} when {@code html}
     *         is {@code null} or empty
     * @author xiaoa
     */
    public static List<String> getImgs(String html, URI uri) {
        if (html == null || html.isEmpty()) {
            return null;
        }
        return getImgs(Jsoup.parse(html), uri);
    }

    /** Resolves the given attribute of every matching element and keeps the non-empty results. */
    private static List<String> collectResolved(Element root, String tag, String attribute, URI uri) {
        List<String> resolved = new ArrayList<>();
        for (Element element : root.select(tag)) {
            String url = splitJointUrl(element.attr(attribute), uri);
            if (url != null && !url.isEmpty()) {
                resolved.add(url);
            }
        }
        return resolved;
    }

    /** Resolves a scheme-less, non-empty reference against a base address. */
    private static String resolveReference(URI base, String ref) {
        char first = ref.charAt(0);
        if (first == '?' || first == '#') {
            // java.net.URI follows RFC 2396 and would drop the last path
            // segment for a query-only reference, so swap the query/fragment
            // on the textual form instead.
            String text = base.toString();
            int cut = text.indexOf('#');
            if (cut < 0) {
                cut = text.length();
            }
            if (first == '?') {
                int query = text.indexOf('?');
                if (query >= 0 && query < cut) {
                    cut = query;
                }
            }
            return text.substring(0, cut) + ref;
        }

        URI effectiveBase = base;
        String path = base.getRawPath();
        if (!base.isOpaque() && base.getRawAuthority() != null
                && (path == null || path.isEmpty())) {
            // With an empty base path URI.resolve glues the reference straight
            // onto the host ("http://hostpage"); give the base a root path first.
            effectiveBase = base.resolve("/");
        }
        return effectiveBase.resolve(ref).toString();
    }

    /** Joins base and reference as plain text; used only when {@link URI} cannot parse the reference. */
    private static String joinAsText(URI base, String ref) {
        if (ref.startsWith("/")) {
            return getMainUrl(base) + ref;
        }
        String text = base.toString();
        return text + (text.endsWith("/") ? "" : "/") + ref;
    }

}